TripAdvisor Analysis

Code
import duckdb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.io as pio
Code
duck_con = duckdb.connect('tripadvisor.db')

Task 1: Understanding User Journeys

How many days pass from first event until first visit at TripAdvisor

Code
days_between_before_first_visit = duck_con.execute("""
WITH prebase_first_visit AS (
    SELECT userid,
        date_casted,
        event_order
    FROM clickstream2
    WHERE visit_tripadvisor = TRUE
        AND day_visited_number = 1
)
SELECT first_event.userid,
    first_event.date_casted first_date_event,
    prebase_first_visit.date_casted first_date_visit,
    DATE_DIFF('day', first_event.date_casted, prebase_first_visit.date_casted) days_between
FROM clickstream2 first_event
INNER JOIN prebase_first_visit ON first_event.userid = prebase_first_visit.userid
WHERE first_event.event_order = 1"""
).df()

# show the cumulative percent of users from day 1 to day 30 
all_days = days_between_before_first_visit['days_between'].value_counts().sort_index().reset_index()
all_days.columns = ['days_between', 'count']
all_days['cumulative_percent'] = round((all_days['count'].cumsum()/len(days_between_before_first_visit))*100, 2)

until_n_day = 60
only_30_days = all_days[all_days['days_between'] <= until_n_day]

# plot with plotly the cumulative percent of users from day 1 to day until_n_day
# Add a annotation only when the cumulative percent is 80%

fig = px.line(
    only_30_days,
    x="days_between",
    y="cumulative_percent",
    title="How many days pass between the first event and the first visit?",
)
# Set fig axis Y from 0 to 100 and X from 0 to 30
fig.update_yaxes(range=[0, 100])
fig.update_xaxes(range=[0, until_n_day])

fig.update_layout(
    xaxis_title="Days between first event and first visit",
    yaxis_title="Cumulative percent of users",
    showlegend=False,
)

users_80_percent = only_30_days[only_30_days["cumulative_percent"] >= 20]
x_value = users_80_percent["days_between"].min()
y_value = users_80_percent["cumulative_percent"].min()
fig.add_annotation(
    x=x_value,
    y=y_value,
    text=f"{round(y_value)}% of users have first visit in {x_value} days",
    showarrow=True,
    arrowhead=1,
)
fig.show(renderer="notebook")

How many days have passed since the session prior to visiting TripAdvisor?

Code
days_between_before_first_visit_and_last_session = duck_con.execute("""
WITH prebase_first_visit AS (
    SELECT userid,
        date_casted,
        event_order
    FROM clickstream2
    WHERE visit_tripadvisor = TRUE
        AND day_visited_number = 1
),
previous_session AS (
SELECT clickstream2.userid,
    MAX(clickstream2.date_casted) previous_date_casted
FROM clickstream2
INNER JOIN prebase_first_visit ON clickstream2.userid = prebase_first_visit.userid
WHERE clickstream2.date_casted < prebase_first_visit.date_casted
GROUP BY clickstream2.userid
)
SELECT prebase_first_visit.userid,
    previous_session.previous_date_casted,
    prebase_first_visit.date_casted first_date_visit,
    DATE_DIFF('day', previous_session.previous_date_casted, prebase_first_visit.date_casted) days_between
FROM prebase_first_visit 
INNER JOIN previous_session ON previous_session.userid = prebase_first_visit.userid"""
).df()

# show the cumulative percent of users from day 1 to day 30 
all_days = days_between_before_first_visit_and_last_session['days_between'].value_counts().sort_index().reset_index()
all_days.columns = ['days_between', 'count']
all_days['cumulative_percent'] = round((all_days['count'].cumsum()/len(days_between_before_first_visit_and_last_session))*100, 2)

until_n_day = 30
only_30_days = all_days[all_days['days_between'] <= until_n_day]

# plot with plotly the cumulative percent of users from day 1 to day until_n_day
# Add a annotation only when the cumulative percent is 80%

fig = px.line(
    only_30_days,
    x="days_between",
    y="cumulative_percent",
    title="How many days have passed since the session prior to visiting TripAdvisor?",
)
# Set fig axis Y from 0 to 100 and X from 0 to 30
fig.update_yaxes(range=[0, 100])
fig.update_xaxes(range=[0, until_n_day])

fig.update_layout(
    xaxis_title="Days between previous session and first visit",
    yaxis_title="Cumulative percent of users",
    showlegend=False,
)

# Set percent of 2 standard deviation
users_80_percent = only_30_days[only_30_days["cumulative_percent"] >= 95]
x_value = users_80_percent["days_between"].min()
y_value = users_80_percent["cumulative_percent"].min()
fig.add_annotation(
    x=x_value,
    y=y_value,
    text=f"{round(y_value)}% of users have first visit in {x_value} days",
    showarrow=True,
    arrowhead=1,
)
fig.show(renderer="notebook")

With this result, we see that we can create the sequence of the 5 days prior to the first visit to TripAdvisor

Most frequent urls in user journey before visit TripAdvisor

Code
duck_con.execute("""SELECT url, COUNT(0) as count 
FROM user_sequence
GROUP BY url
ORDER BY count DESC
LIMIT 10;
""").df()
url count
0 https://www.facebook.com/USER_NAME_REMOVED 75392
1 https://www.youtube.com/ 55745
2 https://www.google.com/ 34442
3 https://www.instagram.com/USER_NAME_REMOVED 24085
4 https://l.facebook.com/USER_NAME_REMOVED 15474
5 https://www.amazon.com/ 13472
6 https://www.yahoo.com/ 11081
7 https://duckduckgo.com/chrome_newtab 9101
8 https://www.linkedin.com/in/USER_REMOVED 8161
9 https://twitter.com/USER_NAME_REMOVED 7469

This result does not give us many insights, that is why we group by domain

Most frequent domains in user journey before visit TripAdvisor

Code
frequent_domains = duck_con.execute("""SELECT domain, COUNT(0) as total_events 
FROM user_sequence_domains
GROUP BY domain
ORDER BY total_events DESC
LIMIT 25;
""").df()

# Plot a tree map with plotly
fig = px.treemap(
    frequent_domains,
    path=["domain"],
    values="total_events",
    title="What are the 25 most frequent domains visited by users in his User Journey to TripAdvisor?",
)
fig.update_traces(
    textinfo="label+value+percent root",
    hovertemplate="<b>%{label}</b> <br> %{value} events",
)
fig.show(renderer="notebook")

What if we categorize this?

Code
child_treemap = duck_con.execute("""
WITH prebase_domains_categories AS (
    SELECT user_sequence_domains.domain,
        CASE WHEN category IS NULL THEN 'Other' ELSE category END AS category,
        COUNT(0) as total_events
FROM user_sequence_domains
LEFT JOIN websites_categories ON user_sequence_domains.domain = websites_categories.domain
GROUP BY user_sequence_domains.domain, 2
)
SELECT category, domain, CAST(total_events AS INT) as total_events
FROM prebase_domains_categories
ORDER BY total_events DESC
LIMIT 50;
""").df()

# Plot a tree map with plotly
fig = px.treemap(
    child_treemap,
    path=["category", "domain"],
    values="total_events",
    title="What are the 50 most frequent domains segmented by category visited by users?",
)

fig.update_traces(
    textinfo="label+value+percent parent",
    hovertemplate="<b>%{label}</b> <br> %{value} events <br> %{percentParent:.2f} of %{parent}",
)
fig.show(renderer="notebook")

Now let’s try with last event touch before landing TripAdvisor

Code
child_treemap = duck_con.execute("""
WITH prebase_domains_categories AS (
    SELECT user_sequence_domains.domain,
        CASE WHEN category IS NULL THEN 'Other' ELSE category END AS category,
        COUNT(0) as total_events
FROM user_sequence_domains
LEFT JOIN websites_categories ON user_sequence_domains.domain = websites_categories.domain
WHERE reverse_sequence = 1
GROUP BY user_sequence_domains.domain, 2
)
SELECT category, domain, CAST(total_events AS INT) as total_events
FROM prebase_domains_categories
ORDER BY total_events DESC
LIMIT 50;
""").df()

# Plot a tree map with plotly
fig = px.treemap(
    child_treemap,
    path=["category", "domain"],
    values="total_events",
    title="What are the 50 most frequent domains segmented by category visited by users?",
)

fig.update_traces(
    textinfo="label+value+percent parent",
    hovertemplate="<b>%{label}</b> <br> %{value} events <br> %{percentParent:.2f} of %{parent}",
)

fig.show(renderer="notebook")

Most frequent 10 User Journeys using the 5 steps above before getting to TripAdvisor

Code
most_frequent_user_journeys = duck_con.execute("""
WITH prebase_data AS (
SELECT userid,
    CASE WHEN category IS NULL THEN 'Other' ELSE category END AS category,
    user_sequence_domains.domain,
    reverse_sequence
FROM user_sequence_domains
LEFT JOIN websites_categories ON user_sequence_domains.domain = websites_categories.domain
WHERE reverse_sequence <= 5
),
-- Now transform table to get column category_1, domain_1, category_2, domain_2, etc. 
-- Instead of one row per user per sequence, have one row per user with all sequences
prebase_journeys AS (
SELECT 
    userid,
    MAX(CASE WHEN reverse_sequence = 5 THEN category ELSE NULL END) AS category_1,
    MAX(CASE WHEN reverse_sequence = 5 THEN domain ELSE NULL END) AS domain_1,
    MAX(CASE WHEN reverse_sequence = 4 THEN category ELSE NULL END) AS category_2,
    MAX(CASE WHEN reverse_sequence = 4 THEN domain ELSE NULL END) AS domain_2,
    MAX(CASE WHEN reverse_sequence = 3 THEN category ELSE NULL END) AS category_3,
    MAX(CASE WHEN reverse_sequence = 3 THEN domain ELSE NULL END) AS domain_3,
    MAX(CASE WHEN reverse_sequence = 2 THEN category ELSE NULL END) AS category_4,
    MAX(CASE WHEN reverse_sequence = 2 THEN domain ELSE NULL END) AS domain_4,
    MAX(CASE WHEN reverse_sequence = 1 THEN category ELSE NULL END) AS category_5,
    MAX(CASE WHEN reverse_sequence = 1 THEN domain ELSE NULL END) AS domain_5
FROM prebase_data
GROUP BY userid
)
SELECT domain_1,
    domain_2,
    domain_3,
    domain_4,
    domain_5,
    COUNT(0) AS total_users
FROM prebase_journeys
GROUP BY domain_1,
    domain_2,
    domain_3,
    domain_4,
    domain_5
ORDER BY total_users DESC
LIMIT 10;
""").df()

most_frequent_user_journeys
domain_1 domain_2 domain_3 domain_4 domain_5 total_users
0 google.com google.com google.com google.com google.com 63
1 NaN NaN NaN NaN google.com 25
2 google.com youtube.com google.com youtube.com google.com 19
3 google.com airbnb.com google.com airbnb.com google.com 17
4 google.com wikipedia.org google.com wikipedia.org google.com 15
5 yahoo.com yahoo.com yahoo.com yahoo.com yahoo.com 15
6 chase.com chase.com chase.com chase.com google.com 13
7 google.com expedia.com google.com expedia.com google.com 10
8 google.com yelp.com google.com yelp.com google.com 9
9 google.com facebook.com google.com facebook.com google.com 9
Code
most_frequent_user_journeys = duck_con.execute("""
WITH prebase_data AS (
SELECT userid,
    CASE WHEN category IS NULL THEN 'Other' ELSE category END AS category,
    user_sequence_domains.domain,
    reverse_sequence
FROM user_sequence_domains
LEFT JOIN websites_categories ON user_sequence_domains.domain = websites_categories.domain
WHERE reverse_sequence <= 5
),
-- Now transform table to get column category_1, domain_1, category_2, domain_2, etc. 
-- Instead of one row per user per sequence, have one row per user with all sequences
prebase_journeys AS (
SELECT 
    userid,
    MAX(CASE WHEN reverse_sequence = 5 THEN category ELSE NULL END) AS category_1,
    MAX(CASE WHEN reverse_sequence = 5 THEN domain ELSE NULL END) AS domain_1,
    MAX(CASE WHEN reverse_sequence = 4 THEN category ELSE NULL END) AS category_2,
    MAX(CASE WHEN reverse_sequence = 4 THEN domain ELSE NULL END) AS domain_2,
    MAX(CASE WHEN reverse_sequence = 3 THEN category ELSE NULL END) AS category_3,
    MAX(CASE WHEN reverse_sequence = 3 THEN domain ELSE NULL END) AS domain_3,
    MAX(CASE WHEN reverse_sequence = 2 THEN category ELSE NULL END) AS category_4,
    MAX(CASE WHEN reverse_sequence = 2 THEN domain ELSE NULL END) AS domain_4,
    MAX(CASE WHEN reverse_sequence = 1 THEN category ELSE NULL END) AS category_5,
    MAX(CASE WHEN reverse_sequence = 1 THEN domain ELSE NULL END) AS domain_5
FROM prebase_data
GROUP BY userid
)
SELECT category_1,
    category_2,
    category_3,
    category_4,
    category_5,
    COUNT(0) AS total_users
FROM prebase_journeys
GROUP BY category_1,
    category_2,
    category_3,
    category_4,
    category_5
ORDER BY total_users DESC
LIMIT 10;
""").df()

most_frequent_user_journeys
category_1 category_2 category_3 category_4 category_5 total_users
0 Search Engines Other Search Engines Other Search Engines 2254
1 Other Other Search Engines Other Search Engines 1138
2 Other Other Other Other Search Engines 906
3 Other Search Engines Other Other Search Engines 814
4 Other Other Other Other Other 516
5 Other Search Engines Other Search Engines Other 513
6 Search Engines Other Other Other Search Engines 462
7 Other Other Other Search Engines Other 313
8 Other Search Engines Search Engines Other Search Engines 215
9 Search Engines Other Other Search Engines Other 205

Size of User Journeys

Code
domain_journeys = duck_con.execute(
    """SELECT userid,
    COUNT(0) AS total_steps
FROM user_sequence_domains
GROUP BY userid;"""
).df()

# Plot a histogram with plotly
fig = px.histogram(
    domain_journeys,
    x="total_steps",
    title="How many steps do users take in their journeys?",
)
fig.update_layout(
    xaxis_title="Number of steps",
    yaxis_title="Number of users",
    showlegend=False,
)
fig.show(renderer="notebook")

How many steps we need to track to get 80% and 95% of User Journeys?

Code
# Calculate the cumulative percent of users  
all_events = domain_journeys['total_steps'].value_counts().sort_index().reset_index()
all_events.columns = ['total_steps', 'count']
all_events['cumulative_percent'] = round((all_events['count'].cumsum()/len(domain_journeys))*100, 2)

# plot with plotly the cumulative percent of users from day 1 to day until_n_day
# Add a annotation only when the cumulative percent is 80% and 95%
fig = px.line(
    all_events,
    x="total_steps",
    y="cumulative_percent",
    title="How many events do users take in their journeys?",
)
# Set fig axis Y from 0 to 100
fig.update_yaxes(range=[0, 100])


fig.update_layout(
    xaxis_title="Number of events",
    yaxis_title="Cumulative percent of users",
    showlegend=False,
)

users_80_percent = all_events[all_events["cumulative_percent"] >= 80]
x_value = users_80_percent["total_steps"].min()
y_value = users_80_percent["cumulative_percent"].min()
fig.add_annotation(
    x=x_value,
    y=y_value,
    text=f"{round(y_value)}% of users have {x_value} events",
    showarrow=True,
    arrowhead=1,
)

users_95_percent = all_events[all_events["cumulative_percent"] >= 95]
x_value = users_95_percent["total_steps"].min()
y_value = users_95_percent["cumulative_percent"].min()
fig.add_annotation(
    x=x_value,
    y=y_value,
    text=f"{round(y_value)}% of users have {x_value} events",
    showarrow=True,
    arrowhead=1,
)

fig.show(renderer="notebook")

What is the longest User Journey?

Code
# Get the User Journey with the most steps
domain_journeys.sort_values(by="total_steps", ascending=False).head(1)
userid total_steps
8510 1ebe1524-4ea7-481c-a054-0b73b861dfc0 3562